library(mosaic)
Loading required package: dplyr
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Loading required package: lattice
Loading required package: ggformula
Loading required package: ggplot2
Loading required package: ggstance
Attaching package: ‘ggstance’
The following objects are masked from ‘package:ggplot2’:
geom_errorbarh, GeomErrorbarh
New to ggformula? Try the tutorials:
learnr::run_tutorial("introduction", package = "ggformula")
learnr::run_tutorial("refining", package = "ggformula")
Loading required package: mosaicData
Loading required package: Matrix
The 'mosaic' package masks several functions from core packages in order to add
additional features. The original behavior of these functions should not be affected by this.
Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
Attaching package: ‘mosaic’
The following object is masked from ‘package:Matrix’:
mean
The following object is masked from ‘package:ggplot2’:
stat
The following objects are masked from ‘package:dplyr’:
count, do, tally
The following objects are masked from ‘package:stats’:
binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test, quantile, sd, t.test, var
The following objects are masked from ‘package:base’:
max, mean, min, prod, range, sample, sum
library(tidyverse)
[30m── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
[30m[32m✓[30m [34mtibble [30m 2.1.3 [32m✓[30m [34mpurrr [30m 0.3.3
[32m✓[30m [34mtidyr [30m 1.0.2 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mreadr [30m 1.3.1 [32m✓[30m [34mforcats[30m 0.5.0[39m
[30m── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mmosaic[30m::[32mcount()[30m masks [34mdplyr[30m::count()
[31mx[30m [34mpurrr[30m::[32mcross()[30m masks [34mmosaic[30m::cross()
[31mx[30m [34mmosaic[30m::[32mdo()[30m masks [34mdplyr[30m::do()
[31mx[30m [34mtidyr[30m::[32mexpand()[30m masks [34mMatrix[30m::expand()
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31mx[30m [34mggstance[30m::[32mgeom_errorbarh()[30m masks [34mggplot2[30m::geom_errorbarh()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31mx[30m [34mtidyr[30m::[32mpack()[30m masks [34mMatrix[30m::pack()
[31mx[30m [34mmosaic[30m::[32mstat()[30m masks [34mggplot2[30m::stat()
[31mx[30m [34mmosaic[30m::[32mtally()[30m masks [34mdplyr[30m::tally()
[31mx[30m [34mtidyr[30m::[32munpack()[30m masks [34mMatrix[30m::unpack()[39m
library(lubridate)
Attaching package: ‘lubridate’
The following object is masked from ‘package:base’:
date
library(DataComputing)
library(rvest)
Loading required package: xml2
Attaching package: ‘rvest’
The following object is masked from ‘package:purrr’:
pluck
The following object is masked from ‘package:readr’:
guess_encoding
library(broom)
As COVID-19 spreads at an alarming rate, a pressing question emerges on a global scale: what characteristics of a country contribute to the spread of the coronavirus? The factors we will analyze are population density and proximity to the origin point (China).
Reading in the Data:
Data Source 1: COVID
# Load the raw COVID-19 CSV export from the working directory.
COVID <- read.csv("total-covid-cases-deaths-per-million.csv")
# Display the imported table, then report how many rows were read.
COVID
nrow(COVID)
[1] 9487
# List the column names exactly as they were imported.
names(COVID)
[1] "total.covid.cases.deaths.per.million" "X" "X.1" "X.2"
[5] "X.3" "X.4" "X.5" "X.6"
[9] "X.7" "X.8" "X.9" "X.10"
[13] "X.11" "X.12" "X.13" "X.14"
[17] "X.15" "X.16" "X.17" "X.18"
[21] "X.19" "X.20" "X.21" "X.22"
[25] "X.23" "X.24" "X.25" "X.26"
[29] "X.27" "X.28" "X.29" "X.30"
[33] "X.31" "X.32" "X.33" "X.34"
[37] "X.35" "X.36" "X.37" "X.38"
[41] "X.39" "X.40" "X.41" "X.42"
[45] "X.43" "X.44" "X.45" "X.46"
[49] "X.47" "X.48" "X.49" "X.50"
[53] "X.51" "X.52" "X.53" "X.54"
[57] "X.55" "X.56" "X.57" "X.58"
[61] "X.59" "X.60" "X.61" "X.62"
[65] "X.63" "X.64" "X.65" "X.66"
[69] "X.67" "X.68" "X.69" "X.70"
[73] "X.71" "X.72" "X.73" "X.74"
[77] "X.75" "X.76" "X.77" "X.78"
[81] "X.79" "X.80" "X.81" "X.82"
[85] "X.83" "X.84" "X.85" "X.86"
[89] "X.87" "X.88" "X.89" "X.90"
[93] "X.91" "X.92" "X.93" "X.94"
[97] "X.95" "X.96" "X.97" "X.98"
[101] "X.99" "X.100" "X.101" "X.102"
[105] "X.103" "X.104" "X.105" "X.106"
[109] "X.107" "X.108" "X.109" "X.110"
[113] "X.111" "X.112" "X.113" "X.114"
[117] "X.115" "X.116" "X.117" "X.118"
[121] "X.119" "X.120" "X.121" "X.122"
[125] "X.123" "X.124" "X.125" "X.126"
[129] "X.127" "X.128" "X.129" "X.130"
[133] "X.131" "X.132" "X.133" "X.134"
[137] "X.135" "X.136" "X.137" "X.138"
[141] "X.139" "X.140" "X.141" "X.142"
[145] "X.143" "X.144" "X.145" "X.146"
[149] "X.147" "X.148" "X.149" "X.150"
[153] "X.151" "X.152" "X.153" "X.154"
[157] "X.155" "X.156" "X.157" "X.158"
[161] "X.159" "X.160" "X.161" "X.162"
[165] "X.163" "X.164" "X.165" "X.166"
[169] "X.167" "X.168" "X.169" "X.170"
[173] "X.171" "X.172" "X.173" "X.174"
[177] "X.175" "X.176" "X.177" "X.178"
[181] "X.179" "X.180" "X.181" "X.182"
[185] "X.183" "X.184" "X.185" "X.186"
[189] "X.187" "X.188" "X.189" "X.190"
[193] "X.191" "X.192" "X.193" "X.194"
[197] "X.195" "X.196" "X.197" "X.198"
[201] "X.199" "X.200" "X.201" "X.202"
[205] "X.203" "X.204" "X.205" "X.206"
[209] "X.207" "X.208" "X.209" "X.210"
[213] "X.211" "X.212" "X.213" "X.214"
[217] "X.215" "X.216" "X.217" "X.218"
[221] "X.219" "X.220" "X.221" "X.222"
[225] "X.223" "X.224" "X.225" "X.226"
[229] "X.227" "X.228" "X.229" "X.230"
[233] "X.231" "X.232" "X.233" "X.234"
[237] "X.235" "X.236" "X.237" "X.238"
[241] "X.239" "X.240" "X.241" "X.242"
[245] "X.243" "X.244" "X.245" "X.246"
[249] "X.247" "X.248" "X.249" "X.250"
[253] "X.251" "X.252" "X.253" "X.254"
# Preview the first rows of the raw COVID table.
head(COVID)
Data Source 2: CountryData
# Display the CountryData table (not created in this file; presumably supplied
# by one of the attached packages), then report its row count.
CountryData
nrow(CountryData)
[1] 256
# List the variables available in CountryData.
names(CountryData)
[1] "country" "area" "pop" "growth" "birth" "death" "migr" "maternal"
[9] "infant" "life" "fert" "health" "HIVrate" "HIVpeople" "HIVdeath" "obesity"
[17] "underweight" "educ" "unemploymentYouth" "GDP" "GDPgrowth" "GDPcapita" "saving" "indProd"
[25] "labor" "unemployment" "family" "tax" "budget" "debt" "inflation" "discount"
[33] "lending" "narrow" "broad" "credit" "shares" "balance" "exports" "imports"
[41] "gold" "externalDebt" "homeStock" "abroadStock" "elecProd" "elecCons" "elecExp" "elecImp"
[49] "elecCap" "elecFossil" "elecNuc" "elecHydro" "elecRenew" "oilProd" "oilExp" "oilImp"
[57] "oilRes" "petroProd" "petroCons" "petroExp" "petroImp" "gasProd" "gasCons" "gasExp"
[65] "gasImp" "gasRes" "mainlines" "cell" "netHosts" "netUsers" "airports" "railways"
[73] "roadways" "waterways" "marine" "military"
# Preview the first rows of CountryData.
head(CountryData)
Data Source 3: Continents
# Load the country/continent lookup CSV from the working directory.
Continents <- read.csv("countries and continents.csv")
# Display the imported table, then report how many rows were read.
Continents
nrow(Continents)
[1] 251
# List the column names of the Continents lookup table.
names(Continents)
[1] "name" "official_name_en" "official_name_fr" "ISO3166.1.Alpha.2" "ISO3166.1.Alpha.3"
[6] "M49" "ITU" "MARC" "WMO" "DS"
[11] "Dial" "FIFA" "FIPS" "GAUL" "IOC"
[16] "ISO4217.currency_alphabetic_code" "ISO4217.currency_country_name" "ISO4217.currency_minor_unit" "ISO4217.currency_name" "ISO4217.currency_numeric_code"
[21] "is_independent" "Capital" "Continent" "TLD" "Languages"
[26] "Geoname.ID" "EDGAR"
# Preview the first rows of the Continents lookup table.
head(Continents)
COVID
Since we are solely focused on the spread of COVID-19, filter out the death count.
# Tidy the raw COVID table.
# The import left placeholder column names (X, X.1, ...) and the first data row
# is dropped below -- presumably it holds the file's real header row (confirm
# against the CSV).
TidyCOVID <- COVID %>%
  rename(
    country = total.covid.cases.deaths.per.million,
    Code = X,
    Date = X.1,
    CasesPerMillion = X.3
  ) %>%
  filter(row_number() > 1) %>%
  # Keep columns 1, 3 and 5: country, Date and CasesPerMillion.
  subset(select = c(1, 3, 5)) %>%
  mutate(
    country = as.character(country),
    Date = mdy(Date),
    # Go through as.character() first: calling as.integer() on a factor returns
    # the internal level codes, not the numbers the labels spell out, and it
    # would also truncate fractional per-million rates. Entries that are not
    # numeric become NA (with a coercion warning).
    CasesPerMillion = as.numeric(as.character(CasesPerMillion))
  )
TidyCOVID
# Keep only the first three CountryData columns (country, area, pop) and derive
# population density as pop / area, rounded to two decimals.
RelevantCountryData <- CountryData %>%
  select(country, area, pop) %>%
  mutate(popdensity = round(pop / area, 2))
# Attach the country attributes to every COVID row. No `by` is given, so the
# join key is whichever columns the two tables share.
MasterData <- TidyCOVID %>%
  left_join(RelevantCountryData)
Joining, by = "country"
# Drop the continent and world aggregate rows so only individual countries
# remain, then estimate absolute case counts from the per-million rate.
MasterData <- MasterData %>%
  filter(
    !country %in% c(
      "Africa", "Asia", "Europe", "North America",
      "Oceania", "South America", "World"
    )
  ) %>%
  # Scale by the unrounded population. Rounding pop to whole millions before
  # multiplying would zero out every country with fewer than 500,000 people
  # and distort the estimate for other small countries.
  mutate(Cases = round(CasesPerMillion * pop / 1e6))
MasterData
# For each country, find the earliest date on which the case count is non-zero.
FirstInstance <- MasterData %>%
  filter(Cases != 0) %>%
  group_by(country) %>%
  summarise(beginningofspread = min(Date))
FirstInstance
# Snapshot of every country on 2020-04-05 with its average number of cases per
# day since its first non-zero case count, sorted from fastest to slowest.
DailySpread <- MasterData %>%
  left_join(FirstInstance, by = "country") %>%
  filter(Date == as.Date("2020-04-05")) %>%
  mutate(
    dayselapsed = Date - beginningofspread,
    # A first case on the snapshot day gives zero elapsed days. Map that to NA
    # so the rate is NA rather than Inf, which would otherwise sort to the top
    # of any ranking built on dailyspread.
    dailyspread = Cases / na_if(as.numeric(dayselapsed, units = "days"), 0)
  ) %>%
  arrange(desc(dailyspread))
Joining, by = "country"
# Merge the snapshot columns back onto the full table. No `by` is given, so the
# join matches on every column the two tables have in common.
MasterData <- MasterData %>%
  left_join(DailySpread)
Joining, by = c("country", "Date", "CasesPerMillion", "area", "pop", "popdensity", "Cases")
# Scatter plot of population against daily spread, followed by the full table.
MasterData %>%
  ggplot(aes(pop, dailyspread)) +
  geom_point()
MasterData
# Sum the estimated cases over all rows for each date (ignoring missing values)
# and plot the totals over time.
MasterData %>%
  group_by(Date) %>%
  summarise(totalcases = sum(Cases, na.rm = TRUE)) %>%
  ggplot(aes(Date, totalcases)) +
  geom_point()
# Bar chart of the 20 rows with the highest daily spread, tallest bar first.
MasterData %>%
  arrange(desc(dailyspread)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, -dailyspread), y = dailyspread)) +
  # geom_col() draws bars whose heights are the y values themselves.
  geom_col(position = "stack", width = 0.9) +
  # Print y-axis labels in plain notation instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(x = "Country", y = "Spread of COVID-19 Cases Per Day") +
  # Tilt the country names so they do not overlap.
  theme(axis.text.x = element_text(angle = 60, hjust = 1))